Load packages
library(tidyverse)
library(gridExtra)
library(cowplot)
library(viridis)
library(ggridges)
library(ggstance)
library(treeio)
library(ggtree)
library(tidytree)
Load and combine data
studies <- read_csv("../data/studies_gsheet.csv")
species <- read_csv("../data/species_gsheet.csv")
fulltree <- read.nexus("../data/consensusTree_10kTrees_298Primates_V3.nex")
refs <- read_csv("../data/ref_nodes.csv")
data <- studies %>%
left_join(species, by = c("species" = "species_file")) %>%
rename(label = species_tree, label2 = updated_nomenclature, num = n) %>%
mutate(label2 = ifelse(is.na(label2), label, label2)) %>%
select(label, label2, studyID, species, site, num)
data2 <- data %>%
group_by(label, label2, species) %>%
summarise(Nsites = n_distinct(site), Nstudies = n_distinct(studyID)) %>%
mutate(div_score = Nsites/Nstudies)
# turn tree into tidy dataframe
tree2 <- fulltree %>%
drop.tip(c("Pan_troglodytes_schweinfurthii", "Pan_troglodytes_troglodytes",
"Pan_troglodytes_vellerosus", "Pongo_pygmaeus", "Cercopithecus_cephus_cephus",
"Cercopithecus_erythrogaster_erythrogaster", "Eulemur_fulvus_mayottensis",
"Hapalemur_griseus_griseus", "Microcebus_lokobensis", "Lepilemur_mitsinjoensis")) %>%
as_tibble
inner_nodes <- c(289:298, 307:308, 313:319, 321:325, 327, 329:346, 349:351, 355:357, 359, 375:383, 388, 392:395, 397:403, 405:416, 423:426, 428:432, 438:442, 446:448, 450:454, 457:464, 466:471, 473:477, 481:492, 504, 522:523, 527:529, 532:536, 542:544, 550:552, 555, 557)
tree3 <- tree2 %>%
mutate(label = fct_recode(label,
"Pongo_spp" = "Pongo_abelii",
"Pan_troglodytes" = "Pan_troglodytes_verus")) %>%
left_join(data2) %>%
left_join(species, by = c("label" = "species_tree")) %>%
select(-label, -label2, -species_file) %>%
rename(label = updated_nomenclature) %>%
mutate(
hasN = ifelse(is.na(Nstudies), .1, .5), # used to size branches + color the tip labels
hasN2 = ifelse(is.na(Nstudies) & !(node %in% inner_nodes), 0, .5), # used to color branches
label = str_replace_all(label, "_", " "),
label = ifelse(is.na(label) & species == "orangutan", "Pongo spp",
ifelse(is.na(label) & species == "chimpanzee", "Pan troglodytes", label))) %>%
left_join(refs) %>%
groupClade(refs$node[-1]) %>%
mutate(group = fct_recode(group, "2" = "1"))
Joining, by = "label"
Column `label` joining factor and character vector, coercing into character vector
Joining, by = "node"
# turn back into tree
tree4 <- as.treedata(tree3)
Circular tree of the 10ktree primates
cols <- viridis(4, end = .9)
p <- ggtree(tree4, aes(alpha = hasN2), layout = "circular") + # size = hasN,
# highlight clades with background colors
geom_hilight(node = 478, fill = cols[1], alpha = .3) +
geom_hilight(node = 481, fill = cols[1], alpha = .3) +
geom_hilight(node = 414, fill = cols[2], alpha = .3) +
geom_hilight(node = 293, fill = cols[3], alpha = .3) +
geom_hilight(node = 397, fill = cols[4], alpha = .3) +
# plot tree again to be on top of the highlights
geom_tree() +
# root
geom_rootpoint(size = 1) +
# tips
geom_tippoint(aes(size = Nstudies), alpha = .7) +
geom_tiplab2(aes(alpha = hasN), offset = 3, size = 3) +
# tweak scales
scale_alpha_continuous(range = c(.2, 1)) +
scale_size_area(max_size = 15) +
# widen plotting area
xlim(NA, 100)
p <- rotate(p, 292)
pcol <- ggplot(tibble(cols = cols, x = 1:4), aes(x, y = 1, col = cols)) +
geom_point(size = 6, alpha = .3) +
scale_color_identity("Clade", guide = "legend", breaks = cols[4:1],
labels = c("Hominoidea", "Cercopithecoidea", "Platyrrhini",
"Tarsiiformes & Strepsirrhini")) +
theme_cowplot()
l1 <- get_legend(pcol)
psize <- ggplot(data2, aes(size = Nstudies, x = 1, y = 1)) +
geom_point(alpha = .7) +
scale_size_area("Number of Studies", max_size = 15) +
theme_cowplot()
l2 <- get_legend(psize)
px <- plot_grid(p, plot_grid(NA, l1, l2, NA, ncol = 1, rel_heights = c(.3, .15, .15, .3)), NA,
nrow = 1, rel_widths = c(1, .2, .1))
Cannot convert object of class logical into a grob.
Cannot convert object of class logical into a grob.
Removed 219 rows containing missing values (geom_point_g_gtree).
Cannot convert object of class logical into a grob.
px

ggsave("../graphs/phylo_full.pdf", px, width = 7, height = 5.5, scale = 2)
ggsave("../graphs/phylo_full.png", width = 7, height = 5.5, scale = 2)
ggsave("../graphs/phylo_full.tiff", width = 7, height = 5.5, scale = 2, type = "cairo",
compression = "lzw")
# to figure out node numbers
n1 <- p + geom_text(aes(label = node, x = branch), size = 2, col = "blue", vjust = -.5)
ggsave("../graphs/full_tree_nodes_circular.pdf", n1, width = 8, height = 8, scale = 2)
n2 <- ggtree(tree4, aes(size = hasN, alpha = hasN2)) +
# highlight clades with background colors
geom_hilight(node = 478, fill = cols[1], alpha = .3) +
geom_hilight(node = 481, fill = cols[1], alpha = .3) +
geom_hilight(node = 414, fill = cols[2], alpha = .3) +
geom_hilight(node = 293, fill = cols[3], alpha = .3) +
geom_hilight(node = 397, fill = cols[4], alpha = .3) +
# plot tree again to be on top of the highlights
geom_tree() +
# root
geom_rootedge(rootedge = 2) +
geom_rootpoint(size = 1) +
# node labels
geom_text(aes(label = node, x = branch), size = 2, col = "blue", vjust = -.5) +
# tips
geom_tippoint(aes(size = Nstudies), alpha = .7) +
geom_tiplab(aes(alpha = hasN), offset = 1.8, size = 3) +
# tweak scales
scale_alpha_continuous(range = c(.2, 1)) +
scale_size_continuous(range = c(.5, 15)) +
# widen plotting area
expand_limits(x = 90) +
theme_tree2()
ggsave("../graphs/full_tree_nodes.pdf", n2, width = 8, height = 20, scale = 2)
Sample size in detail
# subset tree to just those species who have sample sizes reported, i.e. those who were tested
to_drop <- tree3 %>% filter(is.na(Nstudies)) %>% pull(label)
tree5 <- drop.tip(tree4, to_drop)
d3 <- data %>%
mutate(label = str_replace_all(label2, "_", " ")) %>%
group_by(label, species, studyID) %>%
summarise(num = sum(num))
d3 %>% arrange(desc(num))
# filter super large samples out for visualization? note in caption
# species with more than X sites can get a density
d3a <- d3 %>% group_by(species) %>% filter(n_distinct(studyID) >= 4, num <= 200)
d3b <- d3 %>% # setdiff(d3, d3a) %>% ## <- to NOT show points for densities
group_by(species) %>%
# create variable num2 is NA if there's only one data point for a species
# --> those species will only get the vertical crossbar
mutate(flag = n_distinct(studyID) == 1) %>%
ungroup %>%
mutate(num2 = ifelse(flag, NA, num)) %>%
filter(num <= 200)
# for vertical crossbar = median
d4 <- d3 %>%
group_by(label, species) %>%
summarise(Mdn = median(num, na.rm = T)) # totalN = sum(num), sitesN = n_distinct(site)
# for vertical line in ridge plot (grand median)
# + hacky way to make horizontal grid lines for right panel only
v <- tibble(reference = c(NA, median(d3$num, na.rm = T)), .panel = c("Tree", "xSample size"))
h <- tibble(reference = c(NA, 1:Ntip(tree5)), .panel = c("Tree", rep("xSample size", Ntip(tree5))))
# for axis labels
ax <- tibble(lab = c("Distance (Millions of years)", "Sample size"),
x = c(60, 100), y = -4, .panel = c("Tree", "xSample size"))
# Nsites/studies labels
Nlab <- tibble(lab = c("# Sites", "# Studies"), x = c(125, 136), y = Ntip(tree5) + 1,
.panel = "Tree")
# LEFT FACET
q <- ggtree(tree5, aes(col = group)) +
# root
geom_rootedge(rootedge = 5) +
# tip labels
geom_tippoint(aes(size = Nstudies), shape = 21, fill = "white") +
geom_tippoint(aes(size = Nsites), stroke = 0, alpha = .8) +
# geom_tiplab(aes(label = str_c(label, " (", Nsites, "/", Nstudies, ")")), offset = 4, size = 3) +
geom_tiplab(offset = 4, size = 3) +
geom_text(aes(label = Nsites), x = 135, hjust = 1, size = 3) +
geom_text(aes(label = Nstudies), x = 142, hjust = 1, size = 3) +
# tweak scales
scale_color_manual(values = c("grey30", cols)) +
scale_fill_manual(values = cols) +
scale_size_area(max_size = 8) +
# display timescale at the bottom
theme_tree2() +
xlim_tree(142) +
xlim_expand(c(0, 175), "xSample size") +
# add axis + Nstudies/sites labels
geom_text(data = ax, aes(label = lab), col = "black") +
geom_text(data = Nlab, aes(label = lab), col = "black", size = 2.5) +
scale_x_continuous(expand = expand_scale(mult = c(0, .01))) +
scale_y_continuous(limits = c(2, Ntip(tree5)-1), oob = function(x, ...) x) +
coord_cartesian(clip = "off") +
# add reference lines (these will show up on right panel of facet_plot only)
geom_hline(data = h, aes(yintercept = reference), lwd = .2, col = "grey", alpha = .5) +
geom_vline(data = v, aes(xintercept = reference), lwd = 1.5, col = "grey", alpha = .3) +
# remove facet strips, expand bottom margin (to make space for x axis labels)
theme(strip.text = element_blank(), strip.background = element_blank(),
plot.margin = unit(c(1, 1, 2, 1.5), "cm"), panel.spacing = unit(1, "cm"))
q <- rotate(q, 72)
# right-side viz depends on the number of sites per species:
# 1 site = vertical crossbar only
# 2+ sites = points + crossbar at median
# X+ sites = densities (currently, X = 4 just to illustrate)
# dirty hack: x in front of "Sample size" is to have that panel sort to the right (alphabetically) until I figure out why it doesn't just go by order. This cropped up as an issue when I added the dummy point for the x-axis expansion...
# ADD RIGHT FACET
qx <- q %>%
# densities for species with enough sites
facet_plot("xSample size", d3a, geom_density_ridges,
aes(x = num, group = label, fill = group, height = ..density..),
alpha = .5, lwd = .3, scale = .3) %>%
# vertical crossbar for Mdn
facet_plot("xSample size", d4, geom_crossbarh, aes(x = Mdn, xmin = Mdn, xmax = Mdn, group = label,
col = group), alpha = .5, width = .6, fatten = 1.5) %>%
# vertical mark for individual sites
facet_plot("xSample size", d3b, geom_jitter, aes(x = num2, group = label), shape = "|", size = 2.5,
width = .5, height = 0, alpha = .5)
# add legends
psize <-
ggplot(data2, aes(x = 1, y = 1)) +
geom_point(aes(size = Nstudies), col = NA) +
geom_point(aes(size = Nsites), stroke = 0, alpha = .8) +
scale_size_area("Number of Sites", max_size = 8, breaks = c(1, 5, 10, 25, 50)) +
theme_cowplot()
psize2 <-
ggplot(data2, aes(x = 1, y = 1)) +
geom_point(aes(size = Nstudies), shape = 21, fill = "white") +
scale_size_area("\nNumber of Studies", max_size = 8, breaks = c(1, 5, 10, 25, 50, 100)) +
theme_cowplot()
l2 <- get_legend(psize)
Removed 69 rows containing missing values (geom_point).
l3 <- get_legend(psize2)
qx2 <- plot_grid(qx, plot_grid(NA, l1, l2, l3, NA, ncol = 1, rel_heights = c(.3, .1, .1, .1, .3)), NA,
nrow = 1, rel_widths = c(1, .2, .1))
Cannot convert object of class logical into a grob.
Cannot convert object of class logical into a grob.
Picking joint bandwidth of NaN
no non-missing arguments to min; returning Inf
no non-missing arguments to max; returning -Inf
Picking joint bandwidth of 4.06
no non-missing arguments to max; returning -Inf
Removed 67 rows containing missing values (geom_text).
Removed 67 rows containing missing values (geom_text).
Removed 1 rows containing missing values (geom_hline).
Removed 1 rows containing missing values (geom_vline).
Removed 1 rows containing missing values (geom_crossbarh).
Removed 32 rows containing missing values (geom_point).
Cannot convert object of class logical into a grob.
qx2

ggsave("../graphs/phylo_ridge_site.pdf", width = 8, height = 8, scale = 2)
ggsave("../graphs/phylo_ridge_site.png", width = 8, height = 8, scale = 2)
ggsave("../graphs/phylo_ridge_site.tiff", width = 8, height = 8, scale = 2, type = "cairo",
compression = "lzw")
Diversity score
# subset tree to just those species who have sample sizes reported, i.e. those who were tested
to_drop <- tree3 %>% filter(is.na(Nstudies) | Nstudies < 2) %>% pull(label)
tree6 <- drop.tip(tree4, to_drop)
ggtree(tree6, aes(col = group)) +
# root
geom_rootedge(rootedge = 5) +
# tip labels
geom_tippoint(aes(size = Nstudies), shape = 21, fill = "white") +
geom_tippoint(aes(size = Nsites), stroke = 0, alpha = .8) +
geom_tiplab(offset = 4, size = 3) +
geom_text(aes(label = Nsites), x = 113, hjust = 1, size = 3) +
geom_text(aes(label = Nstudies), x = 120, hjust = 1, size = 3) +
# tweak scales
scale_color_manual(values = c("grey30", cols)) +
scale_fill_manual(values = cols) +
scale_size_area(max_size = 8) +
# display timescale at the bottom
theme_tree2() +
xlim_tree(120) +
xlab("Distance (Millions of years)")
# ggsave("../graphs/phylo_div_score.pdf", width = 4, height = 4.5, scale = 2)
Session info
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tidytree_0.2.8 ggtree_1.16.6 treeio_1.8.2 ggstance_0.3.3 ggridges_0.5.1
[6] viridis_0.5.1 viridisLite_0.3.0 cowplot_1.0.0 gridExtra_2.3 forcats_0.4.0
[11] stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2 readr_1.3.1 tidyr_1.0.0
[16] tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] tidyselect_0.2.5 xfun_0.10 reshape2_1.4.3 haven_2.1.1 lattice_0.20-38
[6] colorspace_1.4-1 vctrs_0.2.0 generics_0.0.2 rlang_0.4.0 pillar_1.4.2
[11] glue_1.3.1 withr_2.1.2 modelr_0.1.5 readxl_1.3.1 rvcheck_0.1.5
[16] lifecycle_0.1.0 plyr_1.8.4 munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
[21] rvest_0.3.4 labeling_0.3 knitr_1.25 parallel_3.6.1 broom_0.5.2
[26] Rcpp_1.0.2 BiocManager_1.30.7 scales_1.0.0 backports_1.1.5 jsonlite_1.6
[31] digest_0.6.21 hms_0.5.1 stringi_1.4.3 grid_3.6.1 cli_1.1.0
[36] tools_3.6.1 magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 ape_5.3
[41] pkgconfig_2.0.3 zeallot_0.1.0 xml2_1.2.2 lubridate_1.7.4 assertthat_0.2.1
[46] httr_1.4.1 rstudioapi_0.10 R6_2.4.0 nlme_3.1-141 compiler_3.6.1
---
title: "Phylogenetic Tree"
output: 
  html_notebook:
    css: style.css
    theme: paper
---

Load packages

```{r, message=FALSE}
library(tidyverse)
library(gridExtra)
library(cowplot)
library(viridis)
library(ggridges)
library(ggstance)
library(treeio)
library(ggtree)
library(tidytree)
```

Load and combine data

```{r, message=FALSE}
# Phylogeny, read from a Nexus file
fulltree <- read.nexus(file = "../data/consensusTree_10kTrees_298Primates_V3.nex")

# Tabular inputs: study records, species name look-up, reference nodes
studies <- read_csv(file = "../data/studies_gsheet.csv")
species <- read_csv(file = "../data/species_gsheet.csv")
refs <- read_csv(file = "../data/ref_nodes.csv")
```

```{r}
# Annotate the study records with the species' tree label (`label`) and its
# updated nomenclature (`label2`); `label2` falls back to `label` when missing.
data <- studies %>%
  left_join(species, by = c("species" = "species_file")) %>%
  select(
    label = species_tree,
    label2 = updated_nomenclature,
    studyID, species, site,
    num = n
  ) %>%
  mutate(label2 = ifelse(is.na(label2), label, label2))
```

```{r}
# Per-species summary: number of distinct sites, number of distinct studies,
# and their ratio (div_score = sites per study).
# The result is explicitly ungrouped so that later verbs applied to `data2`
# are not silently evaluated per group.
data2 <- data %>%
  group_by(label, label2, species) %>%
  summarise(
    Nsites = n_distinct(site),
    Nstudies = n_distinct(studyID),
    div_score = Nsites / Nstudies
  ) %>%
  ungroup()
```

```{r}
# Prune the listed tips from the full tree, then convert the result into a
# tidy data frame (one row per node)
tree2 <- as_tibble(
  drop.tip(
    fulltree,
    c(
      "Pan_troglodytes_schweinfurthii", "Pan_troglodytes_troglodytes",
      "Pan_troglodytes_vellerosus", "Pongo_pygmaeus",
      "Cercopithecus_cephus_cephus",
      "Cercopithecus_erythrogaster_erythrogaster",
      "Eulemur_fulvus_mayottensis", "Hapalemur_griseus_griseus",
      "Microcebus_lokobensis", "Lepilemur_mitsinjoensis"
    )
  )
)

# Node numbers that get hasN2 = .5 instead of 0 even though they have no study
# count of their own (see the hasN2 definition below).
# NOTE(review): presumably the internal nodes on the paths to studied tips,
# read off the node-number plots -- confirm.
inner_nodes <- c(
  289:298, 307:308, 313:319, 321:325, 327, 329:346, 349:351, 355:357, 359,
  375:383, 388, 392:395, 397:403, 405:416, 423:426, 428:432, 438:442,
  446:448, 450:454, 457:464, 466:471, 473:477, 481:492, 504, 522:523,
  527:529, 532:536, 542:544, 550:552, 555, 557
)

# Attach the per-species study counts and the display names to the tree table,
# derive the plotting helpers hasN / hasN2, and assign clade groups.
tree3 <- tree2 %>%
  # Rename the two tips that stand in for pooled taxa (presumably so they match
  # the labels used in `data2` -- confirm). fct_recode() returns a factor, so
  # convert back to character: the join below then compares character with
  # character instead of triggering a coercion warning.
  mutate(
    label = as.character(fct_recode(
      label,
      "Pongo_spp" = "Pongo_abelii",
      "Pan_troglodytes" = "Pan_troglodytes_verus"
    ))
  ) %>%
  # join keys are spelled out so added columns can never change them silently
  left_join(data2, by = "label") %>%
  left_join(species, by = c("label" = "species_tree")) %>%
  # from here on `label` is the updated nomenclature
  select(-label, -label2, -species_file) %>%
  rename(label = updated_nomenclature) %>%
  mutate(
    hasN = ifelse(is.na(Nstudies), .1, .5), # used to size branches + color the tip labels
    hasN2 = ifelse(is.na(Nstudies) & !(node %in% inner_nodes), 0, .5), # used to color branches
    label = str_replace_all(label, "_", " "),
    # the two recoded tips have no updated nomenclature; label them by hand
    label = case_when(
      is.na(label) & species == "orangutan" ~ "Pongo spp",
      is.na(label) & species == "chimpanzee" ~ "Pan troglodytes",
      TRUE ~ label
    )
  ) %>%
  left_join(refs, by = "node") %>%
  # one group per reference node (all but the first row of `refs`)
  groupClade(refs$node[-1]) %>%
  # merge group "1" into group "2"
  mutate(group = fct_recode(group, "2" = "1"))

# turn back into tree
tree4 <- as.treedata(tree3)
```

# Circular tree of the 10ktree primates

```{r}
# four viridis colours, one per highlighted clade
cols <- viridis(n = 4, end = 0.9)
```

```{r}
# Circular tree: branch alpha follows hasN2, tip-label alpha follows hasN,
# tip points are sized by the number of studies.
p <- ggtree(tree4, aes(alpha = hasN2), layout = "circular")

# clade backgrounds first, then the tree once more so it sits on top of them
p <- p +
  list(
    geom_hilight(node = 478, fill = cols[1], alpha = 0.3),
    geom_hilight(node = 481, fill = cols[1], alpha = 0.3),
    geom_hilight(node = 414, fill = cols[2], alpha = 0.3),
    geom_hilight(node = 293, fill = cols[3], alpha = 0.3),
    geom_hilight(node = 397, fill = cols[4], alpha = 0.3)
  ) +
  geom_tree()

# root marker, tip points and tip labels
p <- p +
  geom_rootpoint(size = 1) +
  geom_tippoint(aes(size = Nstudies), alpha = 0.7) +
  geom_tiplab2(aes(alpha = hasN), offset = 3, size = 3)

# scales, plus a wider x range so the labels fit
p <- p +
  scale_alpha_continuous(range = c(0.2, 1)) +
  scale_size_area(max_size = 15) +
  xlim(NA, 100)

# swap the two branches below node 292
p <- rotate(p, 292)
```

```{r}
# Throw-away plot whose only purpose is to provide the clade colour legend
pcol <- ggplot(
  tibble(x = 1:4, cols = cols),
  aes(x = x, y = 1, colour = cols)
) +
  geom_point(size = 6, alpha = 0.3) +
  scale_colour_identity(
    name = "Clade",
    guide = "legend",
    breaks = rev(cols),
    labels = c(
      "Hominoidea", "Cercopithecoidea", "Platyrrhini",
      "Tarsiiformes & Strepsirrhini"
    )
  ) +
  theme_cowplot()

# keep just the legend
l1 <- get_legend(pcol)
```

```{r}
# Throw-away plot that provides the point-size legend; it uses the same
# scale_size_area(max_size = 15) as the circular tree
psize <- ggplot(data2, aes(x = 1, y = 1, size = Nstudies)) +
  geom_point(alpha = 0.7) +
  scale_size_area(name = "Number of Studies", max_size = 15) +
  theme_cowplot()

# keep just the legend
l2 <- get_legend(psize)
```

```{r, fig.width=7, fig.height=5.5, cache=TRUE}
# Final figure: circular tree on the left, a column with the two legends in the
# middle, and an empty spacer column on the right.
# Empty grid cells are given as NULL; NA is not a plot and makes cowplot warn
# "Cannot convert object of class logical into a grob".
px <- plot_grid(
  p,
  plot_grid(NULL, l1, l2, NULL, ncol = 1, rel_heights = c(.3, .15, .15, .3)),
  NULL,
  nrow = 1, rel_widths = c(1, .2, .1)
)

px
```

```{r, cache=TRUE}
# Write the combined figure in three formats. The plot is passed explicitly to
# every call so the output does not depend on whatever last_plot() holds
# (e.g. when the chunk that prints `px` is served from the cache).
ggsave("../graphs/phylo_full.pdf", px, width = 7, height = 5.5, scale = 2)
ggsave("../graphs/phylo_full.png", px, width = 7, height = 5.5, scale = 2)
ggsave("../graphs/phylo_full.tiff", px, width = 7, height = 5.5, scale = 2, type = "cairo",
       compression = "lzw")
```

```{r, fig.width=8, fig.height=8, cache=TRUE, eval=FALSE}
# Diagnostic plot: print each node's number on its branch of the circular tree
# (used to look up node numbers)
n1 <- p +
  geom_text(
    aes(x = branch, label = node),
    colour = "blue", size = 2, vjust = -0.5
  )

ggsave(
  filename = "../graphs/full_tree_nodes_circular.pdf", plot = n1,
  width = 8, height = 8, scale = 2
)
```

```{r, fig.width=8, fig.height=20, cache=TRUE, eval=FALSE}
# Diagnostic plot: rectangular version of the full tree with node numbers
n2 <- ggtree(tree4, aes(size = hasN, alpha = hasN2))

# clade backgrounds first, then the tree once more so it sits on top of them
n2 <- n2 +
  list(
    geom_hilight(node = 478, fill = cols[1], alpha = 0.3),
    geom_hilight(node = 481, fill = cols[1], alpha = 0.3),
    geom_hilight(node = 414, fill = cols[2], alpha = 0.3),
    geom_hilight(node = 293, fill = cols[3], alpha = 0.3),
    geom_hilight(node = 397, fill = cols[4], alpha = 0.3)
  ) +
  geom_tree()

# root, node numbers, tip points and tip labels
n2 <- n2 +
  geom_rootedge(rootedge = 2) +
  geom_rootpoint(size = 1) +
  geom_text(
    aes(x = branch, label = node),
    colour = "blue", size = 2, vjust = -0.5
  ) +
  geom_tippoint(aes(size = Nstudies), alpha = 0.7) +
  geom_tiplab(aes(alpha = hasN), offset = 1.8, size = 3)

# scales, extra room on the x axis for the labels, and a visible time axis
n2 <- n2 +
  scale_alpha_continuous(range = c(0.2, 1)) +
  scale_size_continuous(range = c(0.5, 15)) +
  expand_limits(x = 90) +
  theme_tree2()

ggsave(
  filename = "../graphs/full_tree_nodes.pdf", plot = n2,
  width = 8, height = 20, scale = 2
)
```


# Sample size in detail

```{r}
# subset tree to just those species who have sample sizes reported, i.e. those who were tested
# (every row of tree3 without a study count has its label passed to drop.tip)
to_drop <- tree3 %>%
  filter(is.na(Nstudies)) %>%
  pull(label)
tree5 <- drop.tip(tree4, to_drop)

# Total sample size per species and study, keyed by the display label
# (label2 with underscores replaced by spaces).
# Ungrouped explicitly so the grouping does not leak into later steps.
d3 <- data %>%
  mutate(label = str_replace_all(label2, "_", " ")) %>%
  group_by(label, species, studyID) %>%
  summarise(num = sum(num)) %>%
  ungroup()
```

```{r}
# show the per-study sample sizes, largest first
arrange(d3, desc(num))
```

```{r}
# Data for the right-hand ("xSample size") panel.
# Samples larger than 200 are left out of the ridges and tick marks
# (filter super large samples out for visualization? note in caption);
# the medians in d4 still use all samples.

# species with at least 4 studies get a density ridge
d3a <- d3 %>%
  group_by(species) %>%
  filter(n_distinct(studyID) >= 4, num <= 200)

# individual data points (tick marks)
d3b <- d3 %>% # setdiff(d3, d3a) %>% ## <- to NOT show points for densities
  group_by(species) %>%
  # num2 is NA if there's only one data point for a species
  # --> those species will only get the vertical crossbar
  mutate(flag = n_distinct(studyID) == 1) %>%
  ungroup() %>%
  mutate(num2 = ifelse(flag, NA, num)) %>%
  filter(num <= 200)

# for vertical crossbar = median per species
d4 <- d3 %>%
  group_by(label, species) %>%
  summarise(Mdn = median(num, na.rm = TRUE)) # totalN = sum(num), sitesN = n_distinct(site)

# for vertical line in ridge plot (grand median)
# + hacky way to make horizontal grid lines for right panel only:
# the "Tree" rows carry NA so nothing is drawn in the left panel
v <- tibble(
  reference = c(NA, median(d3$num, na.rm = TRUE)),
  .panel = c("Tree", "xSample size")
)
h <- tibble(
  reference = c(NA, seq_len(Ntip(tree5))),
  .panel = c("Tree", rep("xSample size", Ntip(tree5)))
)

# for axis labels (one per panel, placed below the plotting area)
ax <- tibble(lab = c("Distance (Millions of years)", "Sample size"),
             x = c(60, 100), y = -4, .panel = c("Tree", "xSample size"))

# Nsites/studies column headers, placed one row above the topmost tip
Nlab <- tibble(lab = c("# Sites", "# Studies"), x = c(125, 136), y = Ntip(tree5) + 1,
               .panel = "Tree")
```

```{r, cache=TRUE}
# LEFT FACET
# Tree of the studied species (tree5), branches coloured by clade group.
# Also carries the text labels and reference lines for the second panel
# ("xSample size"), which is added later.
q <- ggtree(tree5, aes(col = group)) +
  # root
  geom_rootedge(rootedge = 5) +
  # tip points: white-filled circle sized by number of studies, with a
  # solid point sized by number of sites drawn on top
  geom_tippoint(aes(size = Nstudies), shape = 21, fill = "white") +
  geom_tippoint(aes(size = Nsites), stroke = 0, alpha = .8) +
  # tip labels, followed by two right-aligned columns of counts at fixed x
  # geom_tiplab(aes(label = str_c(label, " (", Nsites, "/", Nstudies, ")")), offset = 4, size = 3) +
  geom_tiplab(offset = 4, size = 3) +
  geom_text(aes(label = Nsites), x = 135, hjust = 1, size = 3) +
  geom_text(aes(label = Nstudies), x = 142, hjust = 1, size = 3) +
  # tweak scales ("grey30" is the first group colour, then the four clade colours)
  scale_color_manual(values = c("grey30", cols)) +
  scale_fill_manual(values = cols) +
  scale_size_area(max_size = 8) +
  # display timescale at the bottom
  theme_tree2() +
  # x range of the tree panel (up to the count columns) and of the second panel
  xlim_tree(142) +
  xlim_expand(c(0, 175), "xSample size") +
  # add axis + Nstudies/sites labels
  geom_text(data = ax, aes(label = lab), col = "black") +
  geom_text(data = Nlab, aes(label = lab), col = "black", size = 2.5) +
  scale_x_continuous(expand = expand_scale(mult = c(0, .01))) +
  # the oob function returns out-of-range values unchanged, so rows outside the
  # y limits (e.g. the labels in `ax` and `Nlab`) are kept rather than dropped
  scale_y_continuous(limits = c(2, Ntip(tree5)-1), oob = function(x, ...) x) +
  # no clipping, so text placed outside the panel area is still drawn
  coord_cartesian(clip = "off") +
  # add reference lines (these will show up on right panel of facet_plot only)
  geom_hline(data = h, aes(yintercept = reference), lwd = .2, col = "grey", alpha = .5) +
  geom_vline(data = v, aes(xintercept = reference), lwd = 1.5, col = "grey", alpha = .3) +
  # remove facet strips, expand bottom margin (to make space for x axis labels)
  theme(strip.text = element_blank(), strip.background = element_blank(),
        plot.margin = unit(c(1, 1, 2, 1.5), "cm"), panel.spacing = unit(1, "cm"))

# swap the two branches below node 72
q <- rotate(q, 72)
```

```{r, fig.width=6, fig.height=8, cache=TRUE}
# ADD RIGHT FACET
# What is drawn per species depends on how many studies (data points) it has:
#   any number  = vertical crossbar at the median (d4)
#   2+ studies  = one tick mark per study as well (d3b; num2 is NA otherwise)
#   4+ studies  = a density ridge as well (d3a)
#
# dirty hack: the panel is named "xSample size" so that it sorts to the right
# of "Tree" alphabetically, until it is clear why the panels don't simply keep
# their order. This cropped up when the dummy point for the x-axis expansion
# was added.

# densities for species with enough studies
qx <- facet_plot(
  q,
  panel = "xSample size", data = d3a, geom = geom_density_ridges,
  mapping = aes(x = num, group = label, fill = group, height = ..density..),
  alpha = .5, lwd = .3, scale = .3
)

# vertical crossbar for Mdn
qx <- facet_plot(
  qx,
  panel = "xSample size", data = d4, geom = geom_crossbarh,
  mapping = aes(x = Mdn, xmin = Mdn, xmax = Mdn, group = label, col = group),
  alpha = .5, width = .6, fatten = 1.5
)

# vertical mark for individual studies
qx <- facet_plot(
  qx,
  panel = "xSample size", data = d3b, geom = geom_jitter,
  mapping = aes(x = num2, group = label),
  shape = "|", size = 2.5, width = .5, height = 0, alpha = .5
)
```

```{r}
# Legends for the two tip-point layers, taken from throw-away plots that use
# the same scale_size_area(max_size = 8) as the tree.

# solid points = number of sites; the invisible Nstudies layer is there so the
# size scale is trained on both variables
psize <- ggplot(data2, aes(x = 1, y = 1)) +
  geom_point(aes(size = Nstudies), colour = NA) +
  geom_point(aes(size = Nsites), stroke = 0, alpha = 0.8) +
  scale_size_area(
    name = "Number of Sites",
    max_size = 8,
    breaks = c(1, 5, 10, 25, 50)
  ) +
  theme_cowplot()

# white-filled circles = number of studies
psize2 <- ggplot(data2, aes(x = 1, y = 1)) +
  geom_point(aes(size = Nstudies), shape = 21, fill = "white") +
  scale_size_area(
    name = "\nNumber of Studies",
    max_size = 8,
    breaks = c(1, 5, 10, 25, 50, 100)
  ) +
  theme_cowplot()

# keep just the legends
l2 <- get_legend(psize)
l3 <- get_legend(psize2)
```

```{r, fig.width=8, fig.height=8, cache=TRUE}
# Final figure: two-panel tree plot on the left, a column with the three
# legends in the middle, and an empty spacer column on the right.
# Empty grid cells are given as NULL; NA is not a plot and makes cowplot warn
# "Cannot convert object of class logical into a grob".
qx2 <- plot_grid(
  qx,
  plot_grid(NULL, l1, l2, l3, NULL, ncol = 1, rel_heights = c(.3, .1, .1, .1, .3)),
  NULL,
  nrow = 1, rel_widths = c(1, .2, .1)
)

qx2
```

```{r}
# Write the combined figure in three formats. The plot is passed explicitly so
# the files do not depend on whatever last_plot() holds when this chunk runs
# (e.g. when the chunk that prints `qx2` is served from the cache).
ggsave("../graphs/phylo_ridge_site.pdf", qx2, width = 8, height = 8, scale = 2)
ggsave("../graphs/phylo_ridge_site.png", qx2, width = 8, height = 8, scale = 2)
ggsave("../graphs/phylo_ridge_site.tiff", qx2, width = 8, height = 8, scale = 2, type = "cairo",
       compression = "lzw")
```

# Diversity score

```{r, eval=FALSE}
# subset tree to species with at least two studies: rows with no study count
# or with fewer than two studies have their label passed to drop.tip
to_drop <- pull(filter(tree3, is.na(Nstudies) | Nstudies < 2), label)
tree6 <- drop.tip(tree4, to_drop)
```

```{r, fig.width=4, fig.height=5, cache=TRUE, eval=FALSE}
# Tree of the species in tree6, branches coloured by clade group, with the
# number of sites and studies shown both as point sizes and as text columns
ggtree(tree6, aes(colour = group)) +
  # root
  geom_rootedge(rootedge = 5) +
  # tip points: white-filled circle = studies, solid point on top = sites
  geom_tippoint(aes(size = Nstudies), shape = 21, fill = "white") +
  geom_tippoint(aes(size = Nsites), stroke = 0, alpha = 0.8) +
  # tip labels and the two right-aligned count columns
  geom_tiplab(offset = 4, size = 3) +
  geom_text(aes(label = Nsites), x = 113, hjust = 1, size = 3) +
  geom_text(aes(label = Nstudies), x = 120, hjust = 1, size = 3) +
  # scales
  scale_size_area(max_size = 8) +
  scale_fill_manual(values = cols) +
  scale_colour_manual(values = c("grey30", cols)) +
  # timescale at the bottom, x range up to the count columns
  theme_tree2() +
  xlim_tree(120) +
  labs(x = "Distance (Millions of years)")
```

```{r}
# ggsave("../graphs/phylo_div_score.pdf", width = 4, height = 4.5, scale = 2)
```

# Session info

```{r}
# record the R version and package versions used for this run
sessionInfo()
```

